(define-module (split-quality-measure))

(use-modules
 ;; SRFI-1 for additional list procedures
 (srfi srfi-1)
 (tree)
 (dataset)
 (data-point))

(define-public calc-proportion
  (lambda (subset class-label label-column-index)
    "Return p * (1 - p), where p is the fraction of data points in SUBSET whose
value in column LABEL-COLUMN-INDEX is numerically equal (=) to CLASS-LABEL.
An empty SUBSET yields 0."
    ;; Does this data point carry the class label we are counting?
    (define (has-class-label? point)
      (= (data-point-get-col point label-column-index)
         class-label))
    (if (dataset-empty? subset)
        ;; Nothing to count; this guard also avoids dividing by zero below.
        0
        (let* ([total (dataset-length subset)]
               [matching (count has-class-label? subset)]
               [fraction (/ matching total)])
          (* fraction (- 1.0 fraction))))))


;; The procedure gini-index evaluates the quality of a split.  It acts as a
;; cost function for a split, and we want to keep the cost of splits low (a
;; greedy approach).  There are other ways of calculating the quality of a
;; split, but for now only the Gini index is implemented.
(define-public gini-index
  (lambda* (subsets label-column-index #:key (labels '(0 1)))
    "Calculate the gini index quality measure, based on the result of a split.

SUBSETS is a list of the subsets produced by the split and LABEL-COLUMN-INDEX
is the column holding the class label.  The result is the sum, over every
subset and every label in LABELS, of the value calc-proportion returns for that
subset and label.

LABELS is the list of class labels to consider.  It defaults to (0 1), i.e.
binary classification, so existing two-argument calls behave as before."
    ;; Cost contributed by a single subset: the sum of the per-label values.
    (define (subset-cost subset)
      (apply +
             (map (lambda (label)
                    (calc-proportion subset label label-column-index))
                  labels)))
    ;; The cost of the whole split is the sum of the costs of its subsets.
    (apply + (map subset-cost subsets))))
